import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Render figures inline when running under IPython/Jupyter. The bare
# "% matplotlib inline" magic is not valid Python syntax, so the magic is
# invoked through the IPython API and skipped when no IPython shell exists.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # plain Python interpreter: figures are displayed by plt.show()

# Seed NumPy's global random state so the random draws made later in this
# file are repeatable from run to run.
np.random.seed(42)

# Load the course page actions and look at the first rows. As a bare
# expression, df.head() is only displayed in an interactive/notebook session.
df = pd.read_csv('course_page_actions.csv')
df.head()
# Observed click-through rate (CTR) per group: the number of distinct ids
# with an "enroll" action divided by the number of distinct ids with a
# "view" action.

# Control group: select its rows, then compute and display the CTR.
control_df = df[df['group'] == 'control']
control_ctr = (
    control_df.loc[control_df['action'] == 'enroll', 'id'].nunique()
    / control_df.loc[control_df['action'] == 'view', 'id'].nunique()
)
control_ctr

# Experiment group: the same computation on its own rows.
experiment_df = df[df['group'] == 'experiment']
experiment_ctr = (
    experiment_df.loc[experiment_df['action'] == 'enroll', 'id'].nunique()
    / experiment_df.loc[experiment_df['action'] == 'view', 'id'].nunique()
)
experiment_ctr

# Observed difference in CTR (experiment minus control), then display it.
obs_diff = experiment_ctr - control_ctr
obs_diff
# Create a sampling distribution of the difference in proportions
# with bootstrapping: resample the full data set with replacement and
# recompute the experiment-minus-control click-through-rate difference.
diffs = []
size = df.shape[0]
for _ in range(10000):
    b_samp = df.sample(size, replace=True)
    # Per-iteration frames and rates use their own names so the observed
    # control_df / experiment_df / control_ctr / experiment_ctr values
    # computed earlier are not overwritten by the last bootstrap sample.
    b_control_df = b_samp.query('group == "control"')
    b_experiment_df = b_samp.query('group == "experiment"')
    b_control_ctr = (
        b_control_df.query('action == "enroll"').id.nunique()
        / b_control_df.query('action == "view"').id.nunique()
    )
    b_experiment_ctr = (
        b_experiment_df.query('action == "enroll"').id.nunique()
        / b_experiment_df.query('action == "view"').id.nunique()
    )
    diffs.append(b_experiment_ctr - b_control_ctr)

# Convert to numpy array
diffs = np.array(diffs)

# Plot sampling distribution. Showing the figure here keeps this histogram
# from being drawn onto the same axes as the plots that follow.
plt.hist(diffs)
plt.show()
# Simulate distribution under the null hypothesis: normal draws centred on
# zero with the standard deviation of the bootstrap sampling distribution.
null_vals = np.random.normal(0, diffs.std(), 10000)

# Plot the null distribution. Showing the figure here keeps it from being
# overdrawn by the comparison histogram below.
plt.hist(null_vals)
plt.show()

# Compare the bootstrap sampling distribution with the null distribution
plt.hist([diffs, null_vals], label=['diffs', 'null'])
plt.legend(loc='upper left')
plt.show()

# Plot observed statistic with the null distribution. The vertical line marks
# obs_diff (the difference measured on the real data), not the bootstrap mean.
plt.hist(null_vals)
plt.axvline(obs_diff, color='r')
plt.show()
# Compute p-value: the share of simulated null values that exceed the
# observed difference. Comparing an array with a scalar gives a boolean
# array, and its mean is the fraction of True entries.
# NOTE(review): this is a one-sided test (experiment CTR greater than
# control) -- adjust the comparison if a different alternative is intended.
p_value = (null_vals > obs_diff).mean()
p_value